The dataset consists of several medical predictor variables and one target variable (Outcome). The predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and more.
Create a dashboard in Tableau by choosing appropriate chart types and metrics useful for the business. The dashboard must include the following:
!wget https://raw.githubusercontent.com/snikhil17/Simplilearn-Healthcare-Capstone/main/health%20care%20diabetes.csv
!pip install optuna
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import itertools
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
import statsmodels.api as sm
import optuna
from sklearn import preprocessing
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
ExtraTreesClassifier, BaggingClassifier,
GradientBoostingClassifier, GradientBoostingClassifier)
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTETomek
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')
# Load the downloaded diabetes CSV and take a first look at the raw data:
# preview rows, dtypes/non-null counts, and summary statistics.
df = pd.read_csv('/content/health care diabetes.csv')
df.head()
df.info()
df.describe()
There are no missing values as such, but as instructed in the tasks, some variables contain the value 0 where 0 is not physiologically possible; hence we will treat those zeros as missing values and impute them in later steps. Columns that cannot have 0 as their value (but contain it): Glucose, BloodPressure, SkinThickness, Insulin, BMI.
We have only integer and float data types, of which BMI and DiabetesPedigreeFunction are continuous; the rest are discrete variables.
# Physiologically impossible zeros in these columns are really missing
# values; recode them as NaN so they can be imputed later.
na_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[na_cols] =df[na_cols].apply(lambda x: x.replace(0, np.nan))
df.head()
# Percentage of missing values per column, rendered as a colour-graded table.
((df.isnull().sum() / df.shape[0])*100).to_frame().rename({0: 'Missing Values percentage'}, axis = 1).style.background_gradient('gist_heat_r')
# Tidy frame of per-column missing-value percentages, plotted as a
# horizontal bar chart.
missing_val_df = ((df.isnull().sum() / df.shape[0])*100).to_frame().reset_index().rename({'index': 'columns', 0: 'Missing Values percentage'}, axis = 1)
plt.figure(figsize = (10,8))
sns.barplot(data = missing_val_df, x = 'Missing Values percentage', y = 'columns',palette='dark')
plt.title('Missing Values percentage', fontsize = 20, fontweight = 'bold', color = '#bd0b0b')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(fontsize = 12, fontweight = 'bold', color = 'navy')
plt.yticks(fontsize = 12, fontweight = 'bold', color = 'navy');
# Class balance of the target: counts of Outcome = 0 vs 1.
plt.figure(figsize = (10,8))
sns.countplot(data = df, x = 'Outcome', palette='dark')
plt.title('Outcome', fontsize = 20, fontweight = 'bold', color = '#bd0b0b')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(fontsize = 12, fontweight = 'bold', color = 'navy')
plt.yticks(fontsize = 12, fontweight = 'bold', color = 'navy');
"""Imputing BMI using Iterative Imputer"""
it_imputer = IterativeImputer(max_iter=100)
iterimp = it_imputer.fit_transform(df)
"""Create train test imputed dataframe"""
df_eda = pd.DataFrame(iterimp, columns=df.columns)
df_eda.isnull().sum()
""""Saving Correlation Matrix data to create correlation matrix in Tableau"""
cor_m = df_eda.corr().unstack().to_frame().reset_index().rename({'level_0':'level_1_col','level_1':'level_2_col', 0:'Correlation' }, axis =1)
cor_m.to_csv('Correlation_Pivot.csv', index = False)
"""Saving File for Tableau"""
df_tab = df_eda.copy()
df_tab = df_tab.reset_index().rename({'index': 'RowNumber'}, axis = 1)
df_tab.to_csv('data_diabetes.csv', index = False)
# Pregnancies and Age are counts/years; cast them back to int after the
# imputation step (IterativeImputer returns floats).
for i in ['Pregnancies', 'Age']:
df_eda[i] = df_eda[i].astype(int)
# Histograms of the continuous features, coloured by Outcome.
plt.figure(figsize = (20,20))
for i,col in enumerate(df_eda.drop(['Pregnancies', 'Age'], axis = 1)):
plt.subplot(7,2,i+1)
sns.histplot(x = col, data = df_eda, hue = 'Outcome', palette = 'dark' )
plt.xticks(rotation = 0)
plt.title(col, fontsize = 15, fontweight = 'bold', color = '#bd0b0b')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(fontsize = 12, fontweight = 'bold', color = 'brown')
plt.yticks(fontsize = 12, fontweight = 'bold', color = 'brown')
plt.tight_layout();
"""Count plots of Pregnencies and Age with hue as Outcome"""
# Discrete variables get count plots instead of histograms.
plt.figure(figsize = (15,15))
for i,col in enumerate(['Pregnancies', 'Age']):
plt.subplot(2,1,i+1)
sns.countplot(x = col, data = df_eda, hue = 'Outcome', palette = 'dark' )
plt.xticks(rotation = 0)
plt.title(col, fontsize = 15, fontweight = 'bold', color = '#bd0b0b')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(fontsize = 12, fontweight = 'bold', color = 'brown')
plt.yticks(fontsize = 12, fontweight = 'bold', color = 'brown')
plt.tight_layout();
"""Correlation Matrix"""
corr_matrix=df_eda.corr()
matrix = np.tril(corr_matrix)
plt.figure(figsize = (10,10))
sns.heatmap(corr_matrix.T, mask=matrix, square=True, cmap = 'RdBu', annot = True);
"""Pie Chart of Pregnencies and Outcome"""
for i in ['Pregnancies', 'Outcome']:
df.groupby(i)[i].count().plot.pie(y = i, figsize=(10, 10))
plt.show()
plt.tight_layout();
# Boxen (letter-value) plots of every feature vs Outcome, to eyeball
# outliers before treatment.
plt.figure(figsize = (25,50))
for i,col in enumerate(df_eda.drop(['Outcome'], axis =1).columns):
plt.subplot(5,2,i+1)
sns.boxenplot(x = 'Outcome',y = col, data = df_eda,palette = 'dark')
plt.xticks(rotation = 0)
plt.title(f'{col} vs Outcome', fontsize = 20, fontweight = 'bold', color = '#bd0b0b')
plt.xlabel(" ")
plt.ylabel(" ")
# plt.legend(['No-Purchase', 'Purchase'])
plt.xticks(fontsize = 15, fontweight = 'bold', color = 'brown')
plt.yticks(fontsize = 15, fontweight = 'bold', color = 'brown')
plt.tight_layout();
plt.show()
"""Creating a list of Outliers columns"""
outliers_cols = []
for col in df_eda.drop(['Outcome'], axis = 1).columns:
q25, q75 = np.percentile(df_eda[col], 25), np.percentile(df_eda[col], 75)
iqr = q75 - q25
outlier_th = iqr*1.5
lower_th = q25 - outlier_th
upper_th = q75 - outlier_th
if any(df_eda[col].values < lower_th) or any(df_eda[col].values > upper_th):
outliers_cols.append(col)
print(outliers_cols)
"""Treating Outliers"""
for col in outliers_cols:
percentiles = df_eda[col].quantile([0.01, 0.99]).values
df_eda[col] = np.clip(df_eda[col], percentiles[0], percentiles[1])
"""Checking the distribution using Boxen plots after capping and flooring Outliers"""
plt.figure(figsize = (25,50))
for i,col in enumerate(df_eda.drop([ 'Outcome'], axis =1).columns):
plt.subplot(5,2,i+1)
sns.boxenplot(x = 'Outcome',y = col, data = df_eda,palette = 'dark')
plt.xticks(rotation = 0)
plt.title(f'{col} vs Outcome', fontsize = 20, fontweight = 'bold', color = '#bd0b0b')
plt.xlabel(" ")
plt.ylabel(" ")
# plt.legend(['No-Purchase', 'Purchase'])
plt.xticks(fontsize = 15, fontweight = 'bold', color = 'brown')
plt.yticks(fontsize = 15, fontweight = 'bold', color = 'brown')
plt.tight_layout();
plt.show()
plt.figure(figsize = (18,18))
for i in enumerate(df_eda.drop(['Outcome'], axis = 1).columns):
plt.subplot(4,2,i[0]+1)
sns.kdeplot(data = df_eda, x = i[1], hue = 'Outcome', fill = 'dark', palette = 'dark' )
plt.xlabel(i[1],fontsize = 15)
plt.ylabel(" ")
plt.xticks(rotation = 45)
plt.tight_layout()
sns.pairplot(df_eda, hue = 'Outcome')
Pearson's Correlation Coefficient helps you find out the relationship between two quantities. It gives you the measure of the strength of linear association between two variables. The value of Pearson's Correlation Coefficient ranges between -1 and +1: +1 means a perfect positive linear relationship, -1 a perfect negative one, and 0 means no linear correlation.
# Histogram + normal Q-Q plot for every column, to judge normality before
# transformation. NOTE(review): `fmt` is forwarded by sm.qqplot to
# matplotlib's plot call — confirm it is accepted as a keyword argument in
# this statsmodels/matplotlib combination.
col_num = df_eda.columns
fig, axs = plt.subplots(len(col_num), 2,
figsize=(15, len(df_eda.columns)*6))
for i, col in enumerate(col_num):
sns.histplot(df_eda[col], ax=axs[i, 0])
sm.qqplot(df_eda[col].dropna(), line="s", ax=axs[i, 1], fmt='b')
axs[i, 1].set_title(col)
plt.tight_layout()
"""Transforming the Variables to attain Normal Distribution"""
# Power-transform each feature toward a Gaussian shape; Outcome and the
# discrete Pregnancies column are left untouched. NOTE(review): fitted on
# the full data before the train/test split — potential leakage.
pt = preprocessing.PowerTransformer()
for col in df_eda.drop(['Outcome', 'Pregnancies'], axis =1).columns:
df_eda[col] = pt.fit_transform(df_eda[col].values.reshape(-1,1))
# Re-draw the histogram + Q-Q grid to verify the transform.
fig, axs = plt.subplots(len(col_num), 2,
figsize=(15, len(df_eda.columns)*6))
for i, col in enumerate(col_num):
sns.histplot(df_eda[col], ax=axs[i, 0])
sm.qqplot(df_eda[col].dropna(), line="s", ax=axs[i, 1], fmt='b')
axs[i, 1].set_title(col)
plt.tight_layout()
"""Label Encoding Pregnancies Variable"""
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(list(df_eda['Pregnancies'].astype('str').values))
df_eda['Pregnancies'] = label_encoder.transform(df_eda['Pregnancies'].astype('str'))
# Split features / target and rebalance the Outcome classes with SMOTETomek
# (SMOTE oversampling of the minority class combined with Tomek-link
# cleaning of borderline majority samples).
X = df_eda.drop(['Outcome'], axis =1)
Y = df_eda[['Outcome']]
# Implementing Oversampling for Handling Imbalanced
smk = SMOTETomek(random_state=42)
col_X = list(X.columns)
col_y = list(Y.columns)
X_res,y_res=smk.fit_resample(X,Y)
print(X_res.shape,y_res.shape)
X = pd.DataFrame(X_res, columns = col_X)
Y = pd.DataFrame(y_res, columns = col_y)
df = X  # NOTE: df aliases X, so the column assignment below also mutates X
df['Outcome'] = y_res
# BUGFIX: report the *resampled* frame's shape — the original printed
# df_eda.shape, i.e. the pre-resampling data.
print(f"shape of dataset resampling: {df.shape}")
from collections import Counter
print(f'Original dataset Outcome variable count {Counter(df_eda["Outcome"])}')
print(f'Resampled dataset Outcome variable count {Counter(df["Outcome"])}')
Confusion matrix: A confusion matrix is a table that is often used to describe the performance of a classification model.
true positives (TP): We predicted yes, and they do have the disease.
true negatives (TN): We predicted no, and they don't have the disease.
false positives (FP): We predicted yes, but they don't actually have the disease. (Also known as a "Type I error.")
false negatives (FN): We predicted no, but they actually do have the disease. (Also known as a "Type II error.")
precision - What proportion of positive identifications was actually correct?
recall - What proportion of actual positives was identified correctly?
F1 Score: the harmonic mean of precision and recall, balancing the two.
As can be seen in the above figure, the hyperparameter tuner is external to the model and the tuning is done before model training. The result of the tuning process is the optimal values of hyperparameters which is then fed to the model training stage. Let me now introduce Optuna, an optimization library in Python that can be employed for hyperparameter optimization.
Optuna is a software framework for automating the optimization process of these hyperparameters. It automatically finds optimal hyperparameter values by making use of different samplers such as grid search, random, bayesian, and evolutionary algorithms. Let me first briefly describe the different samplers available in optuna.
The following features of optuna encouraged us to use it for hyperparameter tuning for the problems we were trying to solve!
# Function to plot Confusion Matrix (to be used later).
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.copper_r):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        # Scale every row to sum to 1 so cells read as per-class rates.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes)
    plt.yticks(positions, classes)

    cell_fmt = '.2f' if normalize else 'd'
    cutoff = cm.max() / 2.
    # Annotate every cell, switching text colour for contrast on dark cells.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, format(cm[row, col], cell_fmt),
                     horizontalalignment="center",
                     color="white" if cm[row, col] > cutoff else "black")

    # Hide grid lines
    plt.grid(False)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
"""Using the Data for Model Building"""
df['Outcome'] = df['Outcome'].astype(int)
data_proc = df.copy()
# data_proc = df_eda.drop('isTrain', axis =1).copy()
"""Pre-Processed Data"""
data_proc.head()
"""Creating Train and Test Data"""
df_train, df_test = train_test_split(data_proc, test_size= 0.15, stratify=data_proc['Outcome'], random_state = 42)
"""Creating Features and Label"""
features = df_train.drop('Outcome', axis =1)
label = df_train.Outcome
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
def objective(trial):
    """Optuna objective for KNN.

    Returns the mean validation ROC-AUC across the 5 stratified folds
    for this trial's hyper-parameters (higher is better).
    """
    # Search space — drawn once per trial, shared by every fold.
    params = {'n_neighbors': trial.suggest_int('n_neighbors', 2, 30),
              'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
              'weights': trial.suggest_categorical('weights', ['uniform','distance']),
              'metric': trial.suggest_categorical('metric', ['minkowski','euclidean']),
              'leaf_size': trial.suggest_int('leaf_size', 10, 300)
              }
    fold_scores = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):  # Stratified 5-Folds
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        model = KNeighborsClassifier(**params)  # build KNN with trial params
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # positive-class probability
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_scores.append(fold_auc)
    # BUGFIX: the original returned only the LAST fold's AUC, discarding
    # 4/5 of the CV work and optimising a noisy single-fold estimate.
    # Return the mean across folds instead.
    return np.mean(fold_scores)
# --- KNN: run the Optuna search, then refit per fold and evaluate ---
study_knn = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_knn.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_knn = study_knn.best_trial
print('roc_auc_score: {}'.format(trial_knn.value))
print("Best hyperparameters: {}".format(trial_knn.params))
optuna.visualization.plot_optimization_history(study_knn)
optuna.visualization.plot_slice(study_knn)
final_test_predictions_knn = [] #to store final test predictions
final_valid_predictions_knn = [] # to store final validation predictions
scores = [] #to store the scores
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features,label )): # creating Stratify-5-Folds
xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies] # creating xtrain, ytrain
xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies] # creating xvalid and yvalid
# Model building using parameters obtain from Optuna hyper-parameter tuning
# (hard-coded from a previous study run so results are reproducible)
params = {'n_neighbors': 17, 'algorithm': 'ball_tree', 'weights': 'distance',
'metric': 'minkowski', 'leaf_size': 242}
model_knn = KNeighborsClassifier(**params) # instantiate KNeighborsClassifier Class
model_knn.fit(xtrain, ytrain) # Training the model on training set
preds_valid = model_knn.predict_proba(xvalid)[:, 1] # Predicting Validation set
test_preds = model_knn.predict_proba(xtest)[:, 1] # Predicting Test Set
final_test_predictions_knn.append(test_preds) # Appending test predictions to list
final_valid_predictions_knn.append(preds_valid) # Appending valid predictions to list
roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid) # Evaluating valid predictions using roc_auc_curve
roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds) # Evaluating test predictions using roc_auc_curve
print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid) # printing fold and respective roc_auc for validation set
print(f'Fold {fold} AUC_test: ', roc_auc_score_test) # printing fold and respective roc_auc for test set
scores.append(roc_auc_score_valid) # Appending roc_auc_score to list.
print(np.mean(scores), np.std(scores)) # printing mean and standard deviation of scores
# Confusion matrix of the fold-averaged test probabilities, thresholded at 0.5.
plt.figure(figsize = (8,8))
cnf_matrix=metrics.confusion_matrix(ytest,(np.mean(np.column_stack(final_test_predictions_knn), axis=1)).round())
plot_confusion_matrix(cnf_matrix,classes=[0,1])
plt.show()
# calculate roc curve
fpr, tpr, thresholds = metrics.roc_curve(ytest, (np.mean(np.column_stack(final_test_predictions_knn), axis=1)))
# plot no skill
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')
print(metrics.classification_report(ytest,(np.mean(np.column_stack(final_test_predictions_knn), axis=1)).round()))
# Baseline model zoo: a spread of kernel, neighbour, linear, tree,
# ensemble, boosting and neural classifiers — all benchmarked with the
# same routine below.
svc = SVC( kernel="linear", class_weight='balanced', probability=True )
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier( random_state=7)
abc = AdaBoostClassifier( random_state=7)
bc = BaggingClassifier(random_state=7)
etc = ExtraTreesClassifier(random_state=7)
gbdt = GradientBoostingClassifier(random_state=7)
xgb = XGBClassifier(random_state=7)
mlp = MLPClassifier(random_state=7,tol=1e-4)
# Name -> estimator mapping used by the benchmarking loop.
clfs = {'SVC':svc,
'KN' : knc,
'DT': dtc,
'LR': lrc,
'RF': rfc,
'AdaBoost': abc,
'BgC': bc,
'ETC': etc,
'GBDT':gbdt,
'xgb':xgb,
'mlp': mlp
}
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit `clf` on the training split and return the 4-tuple
    (train accuracy, test accuracy, train ROC-AUC, test ROC-AUC)."""
    clf.fit(X_train, y_train)
    # Positive-class probabilities, needed for the AUC metric.
    proba_train = clf.predict_proba(X_train)[:, 1]
    proba_test = clf.predict_proba(X_test)[:, 1]
    acc_train = clf.score(X_train, y_train)
    acc_test = clf.score(X_test, y_test)
    return (acc_train,
            acc_test,
            metrics.roc_auc_score(y_train, proba_train),
            metrics.roc_auc_score(y_test, proba_test))
# Benchmark every baseline model on a fresh 80/20 split, collecting
# train/test accuracy and ROC-AUC for a leaderboard.
accuracy_scores_tr = []
accuracy_scores_ts = []
auc_train_scores = []
auc_test_scores = []
X_train, X_test, y_train, y_test = train_test_split(data_proc.drop('Outcome', axis =1), data_proc['Outcome'], test_size= 0.2,
random_state = 42)
for name,clf in clfs.items():
current_accuracy_tr,current_accuracy_ts,auc_tr, auc_ts = train_classifier(clf, X_train,y_train,X_test,y_test)
print("For ",name)
print("Accuracy Train - ",current_accuracy_tr)
print("Accuracy Test - ",current_accuracy_ts)
print("AUC Train - \n",auc_tr)
print("AUC Test - \n",auc_ts)
accuracy_scores_tr.append(current_accuracy_tr)
accuracy_scores_ts.append(current_accuracy_ts)
auc_train_scores.append(auc_tr)
auc_test_scores.append(auc_ts)
print()
print("="*100)
print()
# Leaderboard sorted by test AUC, colour-graded for quick reading.
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy Test':accuracy_scores_ts,
'AUC Test':auc_test_scores,}).sort_values(by = 'AUC Test', ascending= False).style.background_gradient(cmap = 'RdBu', subset = [ 'AUC Test', 'Accuracy Test' ])
performance_df
def objective(trial):
    """Optuna objective for RandomForest.

    Returns the mean validation ROC-AUC across the 5 stratified folds
    for this trial's hyper-parameters (higher is better).
    """
    # Search space — drawn once per trial, shared by every fold.
    params = {'n_estimators': trial.suggest_int('n_estimators', 10, 800),
              'max_depth': trial.suggest_int('max_depth', 3, 300),
              'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
              'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
              'oob_score': trial.suggest_categorical('oob_score', [True, False])
              }
    fold_scores = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):  # Stratified 5-Folds
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        model = RandomForestClassifier(**params)  # build RF with trial params
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # positive-class probability
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_scores.append(fold_auc)
    # BUGFIX: return the mean CV AUC — the original returned only the
    # last fold's score, a noisy single-fold estimate.
    return np.mean(fold_scores)
# --- RandomForest: run the Optuna search, then refit per fold and evaluate ---
study_rf = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_rf.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_rf = study_rf.best_trial
print(f'roc_auc_score: {trial_rf.value}')
print(f"Best hyperparameters: {trial_rf.params}")
optuna.visualization.plot_optimization_history(study_rf)
optuna.visualization.plot_slice(study_rf)
final_test_predictions_rf = [] #to store final test predictions
final_valid_predictions_rf = [] # to store final validation predictions
scores = [] #to store the scores
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features,label )): # creating Stratify-5-Folds
xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies] # creating xtrain, ytrain
xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies] # creating xvalid and yvalid
# Model building using parameters obtain from Optuna hyper-parameter tuning
# (hard-coded from a previous study run so results are reproducible)
# params = trial_rf.params
params = {'n_estimators': 235, 'max_depth': 137, 'criterion': 'entropy', 'max_features': 'log2', 'oob_score': True}
model_rf = RandomForestClassifier(**params) # instantiate RandomForestClassifier Class
model_rf.fit(xtrain, ytrain) # Training the model on training set
preds_valid = model_rf.predict_proba(xvalid)[:, 1] # Predicting Validation set
test_preds = model_rf.predict_proba(xtest)[:, 1] # Predicting Test Set
final_test_predictions_rf.append(test_preds) # Appending test predictions to list
final_valid_predictions_rf.append(preds_valid) # Appending valid predictions to list
roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid) # Evaluating valid predictions using roc_auc_curve
roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds) # Evaluating test predictions using roc_auc_curve
print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid) # printing fold and respective roc_auc for validation set
print(f'Fold {fold} AUC_test: ', roc_auc_score_test) # printing fold and respective roc_auc for test set
scores.append(roc_auc_score_valid) # Appending roc_auc_score to list.
print(np.mean(scores), np.std(scores)) # printing mean and standard deviation of scores
# Confusion matrix of the fold-averaged test probabilities, thresholded at 0.5.
plt.figure(figsize = (8,8))
cnf_matrix=metrics.confusion_matrix(ytest,(np.mean(np.column_stack(final_test_predictions_rf), axis=1)).round())
plot_confusion_matrix(cnf_matrix,classes=[0,1])
plt.show()
# calculate roc curve
fpr, tpr, thresholds = metrics.roc_curve(ytest, (np.mean(np.column_stack(final_test_predictions_rf), axis=1)))
# plot no skill
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')
print(metrics.classification_report(ytest,(np.mean(np.column_stack(final_test_predictions_rf), axis=1)).round()))
def objective(trial):
    """Optuna objective for ExtraTrees.

    Returns the mean validation ROC-AUC across the 5 stratified folds
    for this trial's hyper-parameters (higher is better).
    """
    # Search space — drawn once per trial, shared by every fold.
    params = {'n_estimators': trial.suggest_int('n_estimators', 10, 800),
              'max_depth': trial.suggest_int('max_depth', 3, 300),
              'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
              'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
              'oob_score': trial.suggest_categorical('oob_score', [True, False])
              }
    fold_scores = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):  # Stratified 5-Folds
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        # bootstrap=True is required for oob_score to be usable.
        model = ExtraTreesClassifier(**params, bootstrap = True)
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # positive-class probability
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_scores.append(fold_auc)
    # BUGFIX: return the mean CV AUC — the original returned only the
    # last fold's score, a noisy single-fold estimate.
    return np.mean(fold_scores)
# --- ExtraTrees: run the Optuna search, then refit per fold and evaluate ---
study_etc = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_etc.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_etc = study_etc.best_trial
print(f'roc_auc_score: {trial_etc.value}')
print(f"Best hyperparameters: {trial_etc.params}")
optuna.visualization.plot_optimization_history(study_etc)
optuna.visualization.plot_slice(study_etc)
final_test_predictions_etc = [] #to store final test predictions
final_valid_predictions_etc = [] # to store final validation predictions
scores = [] #to store the scores
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features,label )): # creating Stratify-5-Folds
xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies] # creating xtrain, ytrain
xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies] # creating xvalid and yvalid
# Model building using parameters obtain from Optuna hyper-parameter tuning
# (hard-coded from a previous study run so results are reproducible)
# params = trial_etc.params
params = {'n_estimators': 79, 'max_depth': 300, 'criterion': 'gini', 'max_features': 'auto', 'oob_score': True}
model_etc = ExtraTreesClassifier(**params, bootstrap = True) # instantiate ExtraTreesClassifier Class
model_etc.fit(xtrain, ytrain) # Training the model on training set
preds_valid = model_etc.predict_proba(xvalid)[:, 1] # Predicting Validation set
test_preds = model_etc.predict_proba(xtest)[:, 1] # Predicting Test Set
final_test_predictions_etc.append(test_preds) # Appending test predictions to list
final_valid_predictions_etc.append(preds_valid) # Appending valid predictions to list
roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid) # Evaluating valid predictions using roc_auc_curve
roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds) # Evaluating test predictions using roc_auc_curve
print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid) # printing fold and respective roc_auc for validation set
print(f'Fold {fold} AUC_test: ', roc_auc_score_test) # printing fold and respective roc_auc for test set
scores.append(roc_auc_score_valid) # Appending roc_auc_score to list.
print(np.mean(scores), np.std(scores)) # printing mean and standard deviation of scores
# Confusion matrix of the fold-averaged test probabilities, thresholded at 0.5.
plt.figure(figsize = (8,8))
cnf_matrix=metrics.confusion_matrix(ytest,(np.mean(np.column_stack(final_test_predictions_etc), axis=1)).round())
plot_confusion_matrix(cnf_matrix,classes=[0,1])
plt.show()
# calculate roc curve
fpr, tpr, thresholds = metrics.roc_curve(ytest, (np.mean(np.column_stack(final_test_predictions_etc), axis=1)))
# plot no skill
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')
print(metrics.classification_report(ytest,(np.mean(np.column_stack(final_test_predictions_etc), axis=1)).round()))
def objective(trial):
    """Optuna objective for XGBoost.

    Returns the mean validation ROC-AUC across the 5 stratified folds
    for this trial's hyper-parameters (higher is better).
    """
    # Search space — drawn once per trial, shared by every fold.
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 300),
        'n_estimators': trial.suggest_int('n_estimators', 10, 800),
        'eta': trial.suggest_float('eta', 0.007, 0.013),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 0.9, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.1),
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 0.9, 0.1),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e4),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e4),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e4),
        'gamma': trial.suggest_loguniform('gamma', 1e-3, 1e4),
        'eval_metric' : 'auc',
        'objective' : 'binary:logistic',
    }
    fold_scores = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):  # Stratified 5-Folds
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        model = XGBClassifier(**params)  # build XGB with trial params
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # positive-class probability
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_scores.append(fold_auc)
    # BUGFIX: return the mean CV AUC — the original returned only the
    # last fold's score, a noisy single-fold estimate.
    return np.mean(fold_scores)
# --- XGBoost: run the Optuna search, then refit per fold and evaluate ---
study_xgb = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_xgb.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_xgb = study_xgb.best_trial
print(f'roc_auc_score: {trial_xgb.value}')
print(f"Best hyperparameters: {trial_xgb.params}")
optuna.visualization.plot_optimization_history(study_xgb)
optuna.visualization.plot_slice(study_xgb)
final_test_predictions_xgb = [] #to store final test predictions
final_valid_predictions_xgb = [] # to store final validation predictions
scores = [] #to store the scores
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features,label )): # creating Stratify-5-Folds
xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies] # creating xtrain, ytrain
xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies] # creating xvalid and yvalid
# Model building using parameters obtain from Optuna hyper-parameter tuning
# (hard-coded from a previous study run so results are reproducible)
# params = trial_xgb.params
params = {'max_depth': 208, 'n_estimators': 616, 'eta': 0.012786688162334569, 'subsample': 0.9, 'colsample_bytree': 0.8,
'colsample_bylevel': 0.6000000000000001, 'min_child_weight': 0.009205157727051775,
'reg_lambda': 0.4237493776682705, 'reg_alpha': 0.0357459259751965, 'gamma': 0.0012862253712724995}
model_xgb = XGBClassifier(**params) # instantiate XGBClassifier Class
model_xgb.fit(xtrain, ytrain) # Training the model on training set
preds_valid = model_xgb.predict_proba(xvalid)[:, 1] # Predicting Validation set
test_preds = model_xgb.predict_proba(xtest)[:, 1] # Predicting Test Set
final_test_predictions_xgb.append(test_preds) # Appending test predictions to list
final_valid_predictions_xgb.append(preds_valid) # Appending valid predictions to list
roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid) # Evaluating valid predictions using roc_auc_curve
roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds) # Evaluating test predictions using roc_auc_curve
print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid) # printing fold and respective roc_auc for validation set
print(f'Fold {fold} AUC_test: ', roc_auc_score_test) # printing fold and respective roc_auc for test set
scores.append(roc_auc_score_valid) # Appending roc_auc_score to list.
print(np.mean(scores), np.std(scores)) # printing mean and standard deviation of scores
# Confusion matrix of the fold-averaged test probabilities, thresholded at 0.5.
plt.figure(figsize = (8,8))
cnf_matrix=metrics.confusion_matrix(ytest,(np.mean(np.column_stack(final_test_predictions_xgb), axis=1)).round())
plot_confusion_matrix(cnf_matrix,classes=[0,1])
plt.show()
# calculate roc curve
fpr, tpr, thresholds = metrics.roc_curve(ytest, (np.mean(np.column_stack(final_test_predictions_xgb), axis=1)))
# plot no skill
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')
print(metrics.classification_report(ytest,(np.mean(np.column_stack(final_test_predictions_xgb), axis=1)).round()))
def objective(trial):
    """Optuna objective for tuning an ``MLPClassifier``.

    Uses the module-level ``skf`` (StratifiedKFold), ``features`` and
    ``label``. Hyper-parameters are suggested once per trial (suggesting
    the same named parameter again inside the loop would return the same
    value anyway), the model is fit on every fold, and the MEAN validation
    ROC-AUC across folds is returned — the original returned the score of
    a single fold, which made the optimisation target needlessly noisy.

    Returns:
        float: mean ROC-AUC over the stratified validation folds.
    """
    params = {
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 1),
        'hidden_layer_sizes': trial.suggest_int('hidden_layer_sizes', 5, 100),
        'max_iter': trial.suggest_int('max_iter', 30, 200),
    }
    fold_aucs = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        model = MLPClassifier(**params)   # fresh model per fold
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # probability of class 1
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_aucs.append(fold_auc)
    return float(np.mean(fold_aucs))
# Run the Optuna search for the MLP: maximise validation ROC-AUC over 100 trials.
study_mlp = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_mlp.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_mlp = study_mlp.best_trial  # trial that achieved the highest ROC-AUC
print(f'roc_auc_score: {trial_mlp.value}')
print(f"Best hyperparameters: {trial_mlp.params}")
# Diagnostics: score vs. trial number, and score vs. each hyper-parameter value.
optuna.visualization.plot_optimization_history(study_mlp)
optuna.visualization.plot_slice(study_mlp)
# ----- MLP: final model, trained per CV fold with the tuned parameters -----
final_test_predictions_mlp = []   # per-fold predicted probabilities on the test set
final_valid_predictions_mlp = []  # per-fold predicted probabilities on the validation fold
scores = []                       # per-fold validation ROC-AUC

# Held-out test split (never used for training in any fold).
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
    xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
    xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]

    # Best hyper-parameters from the Optuna study above, hard-coded for
    # reproducibility without re-running the search.
    # params = trial_mlp.params
    params = {'alpha': 0.001931587917122388, 'hidden_layer_sizes': 93, 'max_iter': 189}
    model_mlp = MLPClassifier(**params)
    model_mlp.fit(xtrain, ytrain)

    preds_valid = model_mlp.predict_proba(xvalid)[:, 1]  # P(class 1) on validation fold
    test_preds = model_mlp.predict_proba(xtest)[:, 1]    # P(class 1) on test set
    final_test_predictions_mlp.append(test_preds)
    final_valid_predictions_mlp.append(preds_valid)

    roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid)
    roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds)
    print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid)
    print(f'Fold {fold} AUC_test: ', roc_auc_score_test)
    scores.append(roc_auc_score_valid)

# CV summary: mean and spread of the validation ROC-AUC.
print(np.mean(scores), np.std(scores))

# Ensemble the per-fold test probabilities by averaging (computed once).
avg_test_preds_mlp = np.mean(np.column_stack(final_test_predictions_mlp), axis=1)

# Confusion matrix of the thresholded (0.5 via .round()) ensemble predictions.
plt.figure(figsize = (8,8))
cnf_matrix = metrics.confusion_matrix(ytest, avg_test_preds_mlp.round())
plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()

# ROC curve of the un-thresholded averaged probabilities.
fpr, tpr, thresholds = metrics.roc_curve(ytest, avg_test_preds_mlp)
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')  # no-skill baseline
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')

print(metrics.classification_report(ytest, avg_test_preds_mlp.round()))
def objective(trial):
    """Optuna objective for tuning an ``SVC``.

    Uses the module-level ``skf`` (StratifiedKFold), ``features`` and
    ``label``. Hyper-parameters are suggested once per trial; the model is
    fit on every fold and the MEAN validation ROC-AUC across folds is
    returned — the original returned the score of a single fold, which
    made the optimisation target needlessly noisy.

    Note: ``probability=True`` is required so ``predict_proba`` is
    available for ROC-AUC scoring.

    Returns:
        float: mean ROC-AUC over the stratified validation folds.
    """
    params = {
        'C': trial.suggest_float('C', 0.0001, 5.0),
        'max_iter': trial.suggest_int('max_iter', 50, 1000),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced']),
        'probability': trial.suggest_categorical('probability', [True]),
    }
    fold_aucs = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        model = SVC(**params)   # fresh model per fold
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # probability of class 1
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_aucs.append(fold_auc)
    return float(np.mean(fold_aucs))
# Run the Optuna search for the SVC: maximise validation ROC-AUC over 100 trials.
study_svc = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_svc.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_svc = study_svc.best_trial  # trial that achieved the highest ROC-AUC
print(f'roc_auc_score: {trial_svc.value}')
print(f"Best hyperparameters: {trial_svc.params}")
# Diagnostics: score vs. trial number, and score vs. each hyper-parameter value.
optuna.visualization.plot_optimization_history(study_svc)
optuna.visualization.plot_slice(study_svc)
# ----- SVC: final model, trained per CV fold with the tuned parameters -----
final_test_predictions_svc = []   # per-fold predicted probabilities on the test set
final_valid_predictions_svc = []  # per-fold predicted probabilities on the validation fold
scores = []                       # per-fold validation ROC-AUC

# Held-out test split (never used for training in any fold).
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
    xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
    xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]

    # Best hyper-parameters from the Optuna study above, hard-coded for
    # reproducibility without re-running the search.
    # params = trial_svc.params
    params = {'C': 1.835976773436064, 'max_iter': 849, 'kernel': 'rbf', 'gamma': 'scale', 'class_weight': 'balanced', 'probability': True}
    model_svc = SVC(**params)
    model_svc.fit(xtrain, ytrain)

    preds_valid = model_svc.predict_proba(xvalid)[:, 1]  # P(class 1) on validation fold
    test_preds = model_svc.predict_proba(xtest)[:, 1]    # P(class 1) on test set
    final_test_predictions_svc.append(test_preds)
    final_valid_predictions_svc.append(preds_valid)

    roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid)
    roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds)
    print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid)
    print(f'Fold {fold} AUC_test: ', roc_auc_score_test)
    scores.append(roc_auc_score_valid)

# CV summary: mean and spread of the validation ROC-AUC.
print(np.mean(scores), np.std(scores))

# Ensemble the per-fold test probabilities by averaging (computed once).
avg_test_preds_svc = np.mean(np.column_stack(final_test_predictions_svc), axis=1)

# Confusion matrix of the thresholded (0.5 via .round()) ensemble predictions.
plt.figure(figsize = (8,8))
cnf_matrix = metrics.confusion_matrix(ytest, avg_test_preds_svc.round())
plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()

# ROC curve of the un-thresholded averaged probabilities.
fpr, tpr, thresholds = metrics.roc_curve(ytest, avg_test_preds_svc)
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')  # no-skill baseline
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')

print(metrics.classification_report(ytest, avg_test_preds_svc.round()))
def objective(trial):
    """Optuna objective for tuning a ``LogisticRegression``.

    Uses the module-level ``skf`` (StratifiedKFold), ``features`` and
    ``label``. Hyper-parameters are suggested once per trial; the model is
    fit on every fold and the MEAN validation ROC-AUC across folds is
    returned — the original returned the score of a single fold, which
    made the optimisation target needlessly noisy.

    Note: the solver is fixed to ``liblinear`` because it supports both
    the ``l1`` and ``l2`` penalties being searched.

    Returns:
        float: mean ROC-AUC over the stratified validation folds.
    """
    params = {
        'C': trial.suggest_float('C', 0.0001, 5.0),
        'max_iter': trial.suggest_int('max_iter', 50, 1000),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear']),
    }
    fold_aucs = []
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
        xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
        xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]
        model = LogisticRegression(**params)   # fresh model per fold
        model.fit(xtrain, ytrain)
        preds_valid = model.predict_proba(xvalid)[:, 1]  # probability of class 1
        fold_auc = metrics.roc_auc_score(yvalid, preds_valid)
        print(fold, fold_auc)
        fold_aucs.append(fold_auc)
    return float(np.mean(fold_aucs))
# Run the Optuna search for LogisticRegression: maximise validation ROC-AUC over 100 trials.
study_lr = optuna.create_study(direction='maximize') # direction = "maximize", optuna will try to maximize the roc_auc_score
study_lr.optimize(objective, n_trials=100) # Running our objective function for 100 trials.
trial_lr = study_lr.best_trial  # trial that achieved the highest ROC-AUC
print(f'roc_auc_score: {trial_lr.value}')
print(f"Best hyperparameters: {trial_lr.params}")
# Diagnostics: score vs. trial number, and score vs. each hyper-parameter value.
optuna.visualization.plot_optimization_history(study_lr)
optuna.visualization.plot_slice(study_lr)
# ----- LogisticRegression: final model, trained per CV fold with tuned parameters -----
final_test_predictions_lr = []   # per-fold predicted probabilities on the test set
final_valid_predictions_lr = []  # per-fold predicted probabilities on the validation fold
scores = []                      # per-fold validation ROC-AUC

# Held-out test split (never used for training in any fold).
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
    xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
    xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]

    # Best hyper-parameters from the Optuna study above, hard-coded for
    # reproducibility without re-running the search.
    # params = trial_lr.params
    params = {'C': 4.141877479565224, 'max_iter': 314, 'penalty': 'l2', 'solver': 'liblinear'}
    model_lr = LogisticRegression(**params)
    model_lr.fit(xtrain, ytrain)

    preds_valid = model_lr.predict_proba(xvalid)[:, 1]  # P(class 1) on validation fold
    test_preds = model_lr.predict_proba(xtest)[:, 1]    # P(class 1) on test set
    final_test_predictions_lr.append(test_preds)
    final_valid_predictions_lr.append(preds_valid)

    roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid)
    roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds)
    print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid)
    print(f'Fold {fold} AUC_test: ', roc_auc_score_test)
    scores.append(roc_auc_score_valid)

# CV summary: mean and spread of the validation ROC-AUC.
print(np.mean(scores), np.std(scores))

# Ensemble the per-fold test probabilities by averaging (computed once).
avg_test_preds_lr = np.mean(np.column_stack(final_test_predictions_lr), axis=1)

# Confusion matrix of the thresholded (0.5 via .round()) ensemble predictions.
plt.figure(figsize = (8,8))
cnf_matrix = metrics.confusion_matrix(ytest, avg_test_preds_lr.round())
plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()

# ROC curve of the un-thresholded averaged probabilities.
fpr, tpr, thresholds = metrics.roc_curve(ytest, avg_test_preds_lr)
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')  # no-skill baseline
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')

print(metrics.classification_report(ytest, avg_test_preds_lr.round()))
This type of ensemble is one of the most intuitive and easiest to understand. The Voting Classifier can be either a homogeneous or a heterogeneous type of Ensemble Learning — that is, the base classifiers can be of the same type or of different types. As mentioned earlier, this type of ensemble also works as an extension of bagging (e.g. Random Forest).
The architecture of a Voting Classifier is made up of a number "n" of ML models, whose predictions are combined in one of two different ways: hard or soft voting. In hard mode, the winning prediction is the one with "the most votes"; in soft mode (used below), the predicted class probabilities of the base models are averaged instead. In Figure 2 we see an example of how the Voting Classifier works in hard mode.
# ----- Soft-voting ensemble of the tuned base models, trained per CV fold -----
final_test_predictions_vclf = []   # per-fold predicted probabilities on the test set
final_valid_predictions_vclf = []  # per-fold predicted probabilities on the validation fold
scores = []                        # per-fold validation ROC-AUC

# Held-out test split (never used for training in any fold).
xtest, ytest = df_test.drop('Outcome', axis=1).copy(), df_test['Outcome'].copy()

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(features, label)):
    xtrain, ytrain = features.iloc[train_indicies], label.iloc[train_indicies]
    xvalid, yvalid = features.iloc[valid_indicies], label.iloc[valid_indicies]

    # Soft voting averages the base models' predicted class probabilities,
    # so every estimator must support predict_proba. The base models carry
    # the hyper-parameters tuned by their respective Optuna studies.
    model_vclf = VotingClassifier(estimators=[
        ('KNN', model_knn),      # KNN classifier
        ('RF', model_rf),        # RandomForest Classifier
        ('ETC', model_etc),      # ExtraTreesClassifier
        ('XGB', model_xgb),      # XGBoost Classifier
        ('MLP-NN', model_mlp),   # Multi-Layered Perceptron
        # ('SVC', model_svc),
        ],
        voting='soft')
    model_vclf.fit(xtrain, ytrain)   # VotingClassifier clones and refits each base model

    preds_valid = model_vclf.predict_proba(xvalid)[:, 1]  # P(class 1) on validation fold
    test_preds = model_vclf.predict_proba(xtest)[:, 1]    # P(class 1) on test set
    final_test_predictions_vclf.append(test_preds)
    final_valid_predictions_vclf.append(preds_valid)

    roc_auc_score_valid = metrics.roc_auc_score(yvalid, preds_valid)
    roc_auc_score_test = metrics.roc_auc_score(ytest, test_preds)
    print(f'Fold {fold} AUC_valid: ', roc_auc_score_valid)
    print(f'Fold {fold} AUC_test: ', roc_auc_score_test)
    scores.append(roc_auc_score_valid)

# CV summary: mean and spread of the validation ROC-AUC.
print(np.mean(scores), np.std(scores))

# Ensemble the per-fold test probabilities by averaging (computed once).
avg_test_preds_vclf = np.mean(np.column_stack(final_test_predictions_vclf), axis=1)

# Confusion matrix of the thresholded (0.5 via .round()) ensemble predictions.
plt.figure(figsize = (8,8))
cnf_matrix = metrics.confusion_matrix(ytest, avg_test_preds_vclf.round())
plot_confusion_matrix(cnf_matrix, classes=[0, 1])
plt.show()

# ROC curve of the un-thresholded averaged probabilities.
fpr, tpr, thresholds = metrics.roc_curve(ytest, avg_test_preds_vclf)
plt.figure(figsize = (8,8))
plt.plot([0, 1], [0, 1], linestyle='--')  # no-skill baseline
plt.plot(fpr, tpr, marker='.')
plt.title('ROC-AUC Curve')

print(metrics.classification_report(ytest, avg_test_preds_vclf.round()))